# Packages for reading the data, wrangling it and drawing the treemaps.
library(data.table)
library(tidyverse)
library(treemap)

# Read the training data, drop the identifier / constant descriptor columns
# and treat the three count columns as categorical.
train <- fread("../data/train.csv", encoding = "UTF-8") %>%
  select(-c(Id, property_type, operation_type, currency)) %>%
  mutate(across(c(rooms, bedrooms, bathrooms), factor))

# Quick look at the first rows.
head(train)
# Draws a two-level treemap of sample sizes: the outer level is the country
# (`pais`), the inner level is `variable`, and tile area is the row count.
#
# datos:    data frame containing `pais` and the column named by `variable`.
# variable: name (string) of the second index column.
# titulo:   chart title.
graficar_treemap <- function(datos, variable, titulo) {
  indice <- c("pais", variable)

  datos %>%
    group_by(across(all_of(indice))) %>%
    count(name = "total") %>%
    ungroup() %>%
    # treemap() documents a plain data.frame as input.
    as.data.frame() %>%
    treemap(
      index = indice,
      vSize = "total",
      type = "index",
      palette = c("#1C8356", "#C4451C"),
      title = titulo,
      fontsize.title = 12
    )
}

# Sample size by country and province / department.
graficar_treemap(
  train,
  variable = "provincia_departamento",
  titulo = "Tamaño muestral: País - Departamento"
)

# Sample size by country and number of rooms.
graficar_treemap(
  train,
  variable = "rooms",
  titulo = "Tamaño muestral: País - # de Salas"
)

# Sample size by country and number of bedrooms.
graficar_treemap(
  train,
  variable = "bedrooms",
  titulo = "Tamaño muestral: País - # de Dormitorios"
)

# Sample size by country and number of bathrooms.
graficar_treemap(
  train,
  variable = "bathrooms",
  titulo = "Tamaño muestral: País - # de Baños"
)

# Provides theme_fivethirtyeight(), used by the ggplot charts that follow.
library(ggthemes)
# Lollipop chart: number of listings per value of rooms / bedrooms / bathrooms.
# The three columns are stored as factors, so they are converted back to
# integers first; scale_x_continuous() rejects discrete (factor / character)
# values.
train %>%
  select(rooms, bedrooms, bathrooms) %>%
  mutate(across(everything(), ~ as.integer(as.character(.x)))) %>%
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  count(key, value, name = "Total") %>%
  ggplot(aes(x = value, y = Total)) +
  facet_wrap(~key, scales = "free") +
  geom_point(size = 2, color = "#C4451C") +
  geom_segment(aes(y = 0, xend = value, yend = Total), color = "#1C8356") +
  scale_x_continuous(n.breaks = 10) +
  theme_fivethirtyeight()

# Boxplots of price and total surface (raw and log scale) for each level of a
# categorical column, one facet per measure. A triangle marks the group mean.
#
# datos:     data frame with `price`, `surface_total` and the column named by
#            `variable`.
# variable:  name (string) of the grouping column shown on the x axis.
# subtitulo: plot subtitle.
# Returns the ggplot object.
graficar_boxplots <- function(datos, variable, subtitulo) {
  datos %>%
    select(all_of(c(variable, "price", "surface_total"))) %>%
    mutate(across(all_of(variable), factor)) %>%
    mutate(
      priceLog = log(price),
      surfaceLog = log(surface_total)
    ) %>%
    pivot_longer(
      cols = -all_of(variable),
      names_to = "key",
      values_to = "valor"
    ) %>%
    ggplot(aes(x = .data[[variable]], y = valor)) +
    facet_wrap(~key, scales = "free") +
    geom_boxplot(
      outlier.alpha = 0.01, fill = "#1C8356", alpha = 0.5,
      color = "#C4451C", size = 0.1
    ) +
    # `fun` replaces the deprecated `fun.y` argument of stat_summary().
    stat_summary(
      fun = mean, geom = "point", color = "#C4451C", size = 2,
      shape = 17
    ) +
    theme_fivethirtyeight() +
    labs(caption = "Triángulo = promedio", subtitle = subtitulo)
}

graficar_boxplots(train, variable = "rooms", subtitulo = "Habitaciones")
graficar_boxplots(train, variable = "bedrooms", subtitulo = "Dormitorios")
graficar_boxplots(train, variable = "bathrooms", subtitulo = "Baños")

# Price against total surface. geom_bin2d() is used instead of geom_point()
# (presumably to limit overplotting - confirm); the line is a linear fit.
train %>%
  ggplot(aes(x = surface_total, y = price)) +
  geom_bin2d(color = "white", alpha = 0.8) +
  scale_fill_gradient2(low = "white", mid = "#1C8356", high = "#C4451C") +
  geom_smooth(method = "lm", color = "#C4451C", size = 2, se = FALSE) +
  theme_fivethirtyeight() +
  theme(legend.position = "right", legend.direction = "vertical")

# Modelling framework used by the sections that follow.
library(tidymodels)
set.seed(123)

# Modelling data set. `train` already had the identifier / descriptor columns
# removed when it was read, so any_of() is used: it drops the columns if they
# are present and does not fail when they are missing.
datosTrain <- train %>%
  select(-any_of(c("Id", "property_type", "operation_type", "currency"))) %>%
  mutate(across(c(rooms, bedrooms, bathrooms), factor))

# 80 / 20 train-test split, stratified on the outcome.
split_inicial <- initial_split(
  data = datosTrain,
  prop = 0.8,
  strata = price
)
datos_train <- training(split_inicial)
datos_test <- testing(split_inicial)

# Model
# Provides registerDoParallel() / stopImplicitCluster() used during tuning.
library(doParallel)

# Regularised linear regression fitted with glmnet; both the penalty and the
# mixture are left as tuning parameters.
mod_glm <- linear_reg(
  mode = "regression",
  penalty = tune(),
  mixture = tune()
) %>%
  set_engine(engine = "glmnet")

# Preprocessing: centre and scale numeric predictors, then dummy-encode the
# nominal ones.
receta <- recipe(formula = price ~ ., data = datos_train) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes())

# Model validation: 10-fold cross-validation stratified on the outcome.
set.seed(1992)
crossVal <- vfold_cv(
  data = datos_train,
  v = 10,
  strata = price
)

# Workflow ----
flujo_modelo <- workflow() %>%
  add_recipe(receta) %>%
  add_model(mod_glm)

# Hyperparameter grid: 10 x 10 regular grid over [0, 1] for both parameters,
# on the natural scale (no transformation).
hiperpar_grid <- grid_regular(
  penalty(range = c(0, 1), trans = NULL),
  mixture(range = c(0, 1), trans = NULL),
  levels = c(10, 10)
)

# Hyperparameter optimisation ----
# Leave one core free, but never request fewer than one worker
# (detectCores() can return 1 or NA).
n_cores <- max(1L, parallel::detectCores() - 1L, na.rm = TRUE)
registerDoParallel(cores = n_cores)
myGrid <- tune_grid(
  object = flujo_modelo,
  resamples = crossVal,
  metrics = metric_set(rmse),
  control = control_grid(save_pred = TRUE),
  grid = hiperpar_grid
)
stopImplicitCluster()

# Final model ----
# `glm_final` is used below but was not defined anywhere in the visible script.
# It is built here from the grid point with the lowest cross-validated RMSE,
# refitted on the full training split.
# NOTE(review): confirm this is the intended final model.
mejores_hiperpar <- select_best(myGrid, metric = "rmse")
glm_final <- flujo_modelo %>%
  finalize_workflow(mejores_hiperpar) %>%
  fit(data = datos_train)

# Hold-out evaluation ----
predicciones <- glm_final %>%
  predict(new_data = datos_test, type = "numeric")
# Missing predictions are replaced by 0.
predicciones[is.na(predicciones)] <- 0

predicciones <- predicciones %>%
  bind_cols(datos_test %>% select(price))

error_test_glm <- rmse(
  data = predicciones,
  truth = price,
  estimate = .pred,
  na_rm = TRUE
) %>%
  mutate(modelo = "GLM")
error_test_glm

# Submission ----
# NOTE(review): `test` and `sampleSub` are not created in the visible part of
# this script; they must exist before this section runs.
prediccionesGLM_Subm1 <- glm_final %>%
  predict(new_data = test, type = "numeric")
# Missing and negative predictions are both set to 0.
prediccionesGLM_Subm1[is.na(prediccionesGLM_Subm1)] <- 0
prediccionesGLM_Subm1[prediccionesGLM_Subm1 < 0] <- 0
hist(prediccionesGLM_Subm1$.pred)

subm1_glmnet <- data.frame(
  Id = sampleSub$Id,
  price = prediccionesGLM_Subm1$.pred
)
write.csv(subm1_glmnet, file = "Subm1.csv", row.names = FALSE)

# Capitals for Colombia and Argentina
capitales_colombia <- c(
  "Armenia", "Barranquilla", "Bogotá D.C", "Bucaramanga",
  "Cali", "Cartagena", "Ibagué", "Medellín", "Neiva",
  "Popayán", "Santa Marta", "Tunja"
)
capitales_argentina <- c(
  "La Plata", "Córdoba", "Corrientes", "Paraná",
  "San Salvador de Jujuy", "Mendoza", "Posadas", "Neuquén",
  "Salta", "San Juan", "San Luis", "Santa Fe", "San Miguel"
)

# Adds the engineered features used by the later models.
#
# datos:        data frame with `pais`, `ciudad`, `rooms`, `bedrooms`,
#               `bathrooms` and `surface_total`.
# capitales_ar: city names flagged as capitals when `pais` is "Argentina".
# capitales_co: city names flagged as capitals when `pais` is "Colombia".
#
# Returns `datos` with:
#   - rooms / bedrooms / bathrooms converted to integer,
#   - Capital:      "Si" when the city is in the list for its country, else "No",
#   - sumaRow:      rooms + bedrooms + bathrooms + surface_total,
#   - mediaRow:     sumaRow / 4,
#   - surfaceClass: "Pequeña" (<= 55), "Mediana" (<= 105) or "Grande".
agregar_variables <- function(datos,
                              capitales_ar = capitales_argentina,
                              capitales_co = capitales_colombia) {
  datos %>%
    # as.character() first, so factor labels (not level codes) are converted.
    mutate(across(
      c(rooms, bedrooms, bathrooms),
      ~ as.integer(as.character(.x))
    )) %>%
    mutate(
      Capital = if_else(
        (pais == "Argentina" & ciudad %in% capitales_ar) |
          (pais == "Colombia" & ciudad %in% capitales_co),
        true = "Si",
        false = "No"
      ),
      sumaRow = rooms + bedrooms + bathrooms + surface_total,
      mediaRow = sumaRow / 4
    ) %>%
    mutate(
      surfaceClass = if_else(
        surface_total <= 55,
        true = "Pequeña",
        false = if_else(surface_total <= 105, true = "Mediana", false = "Grande")
      )
    )
}

newTrain1 <- agregar_variables(train)
newTrain1

# NOTE(review): `test` is not created in the visible part of this script.
newTest1 <- agregar_variables(test)
newTest1

# ------- Test ----
# Categorical columns that are replaced by their dummy-encoded versions.
columnas_categoricas <- c(
  "rooms", "bedrooms", "bathrooms", "pais", "provincia_departamento",
  "ciudad", "Capital", "surfaceClass"
)

# NOTE(review): both .Rdata files are read from the working directory; the
# visible part of this script does not write them.
load("newTest1.Rdata")
test <- newTest1 %>%
  mutate(across(c(rooms, bedrooms, bathrooms), factor)) %>%
  mutate(across(where(is.character), as.factor)) %>%
  as.data.frame()
# dummy_cols() comes from fastDummies, which the script never attaches, so it
# is called with an explicit namespace.
test_dummy <- fastDummies::dummy_cols(test) %>%
  select(-all_of(columnas_categoricas))

# ------- Train ----
load("newTrain1.Rdata")
train <- newTrain1 %>%
  mutate(across(c(rooms, bedrooms, bathrooms), factor)) %>%
  mutate(across(where(is.character), as.factor)) %>%
  relocate(price) %>%
  as.data.frame()
train_dummy <- fastDummies::dummy_cols(train) %>%
  select(-all_of(columnas_categoricas))

# Keep only the columns present in both sets, so levels seen in just one of
# them do not produce mismatched predictors. drop = FALSE keeps a data frame
# even if a single column is shared.
columnas_comunes <- intersect(names(train_dummy), names(test_dummy))
train_dummy2 <- train_dummy[, columnas_comunes, drop = FALSE]
train_dummy2$price <- train$price
test_dummy2 <- test_dummy[, columnas_comunes, drop = FALSE]
train_dummy2